/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.fusesource.hawtdb.internal.page;

import org.fusesource.hawtbuf.Buffer;
import org.fusesource.hawtbuf.DataByteArrayInputStream;
import org.fusesource.hawtbuf.DataByteArrayOutputStream;
import org.fusesource.hawtdb.api.*;
import org.fusesource.hawtdb.api.Paged.SliceType;
import org.fusesource.hawtdb.internal.io.MemoryMappedFile;
import org.fusesource.hawtdb.internal.util.Ranges;
import org.fusesource.hawtdb.util.list.LinkedNodeList;

import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.UnsupportedEncodingException;
import java.nio.ByteBuffer;
import java.util.*;
import java.util.Map.Entry;
import java.util.concurrent.*;
import java.util.zip.CRC32;

import static org.fusesource.hawtdb.internal.page.Logging.*;

/**
 * Provides concurrent page file access via multiversion concurrency control
 * (MVCC).
 *
 * Once a transaction begins working against the data, it acquires a snapshot
 * of all the data in the page file. This snapshot is used to provide the
 * transaction a consistent view of the data in spite of it being concurrently
 * modified by other transactions.
 *
 * When a transaction does a page update, the update is stored in a temporary
 * page location. Subsequent reads of the original page will result in a read
 * of the temporary page. If the transaction rolls back, the temporary pages
 * are freed. If the transaction commits, the page updates are assigned the
 * next snapshot version number and the update gets queued so that it can be
 * applied atomically at a later time.
 *
 * (A usage sketch appears near the {@code tx()} method below.)
 *
 * @author <a href="http://hiramchirino.com">Hiram Chirino</a>
 */
public final class HawtTxPageFile implements TxPageFile {

    public static final int FILE_HEADER_SIZE = 1024 * 4;
    public static final byte[] MAGIC = magic();

    private static byte[] magic() {
        try {
            byte rc[] = new byte[32];
            byte[] tmp = "HawtDB:1.0\n".getBytes("UTF-8");
            System.arraycopy(tmp, 0, rc, 0, tmp.length);
            return rc;
        } catch (UnsupportedEncodingException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * The first 4K of the file is used to hold 2 copies of the header.
     * Each copy is 2K big. The header is checksummed so that corruption
     * can be detected.
     */
    static private class Header {

        /** Identifies the file format */
        public volatile byte[] magic = new byte[32];
        /** The oldest applied commit revision */
        public volatile long base_revision;
        /** The size of each page in the page file */
        public volatile int page_size;
        /** The page location of the free page list */
        public volatile int free_list_page;
        /** Where it is safe to resume recovery... Will be
         *  -1 if no recovery is needed. */
        public volatile int pessimistic_recovery_page;
        /** We try to recover from this point, but it may fail since its
         *  writes may not have been synced to disk. */
        public volatile int optimistic_recovery_page;
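        // An illustrative view of the on-disk layout produced by encode() and
        // verified by decode() below (offsets assume FILE_HEADER_SIZE = 4096):
        //
        //   [0                      .. length-1                   ]  copy #1 fields
        //   [FILE_HEADER_SIZE/2 - 8 .. FILE_HEADER_SIZE/2 - 1     ]  copy #1 CRC32 (long)
        //   [FILE_HEADER_SIZE/2     .. FILE_HEADER_SIZE/2+length-1]  copy #2 fields
        //   [FILE_HEADER_SIZE - 8   .. FILE_HEADER_SIZE - 1       ]  copy #2 CRC32 (long)
        //
        // decode() verifies copy #1 first and falls back to copy #2, so a torn
        // write of one copy never loses the header.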
        public String toString() {
            return "{ base_revision: " + this.base_revision +
                   ", page_size: " + page_size +
                   ", free_list_page: " + free_list_page +
                   ", pessimistic_recovery_page: " + pessimistic_recovery_page +
                   ", optimistic_recovery_page: " + optimistic_recovery_page +
                   " }";
        }

        private final DataByteArrayOutputStream os = new DataByteArrayOutputStream(FILE_HEADER_SIZE);

        Buffer encode() {
            try {
                os.reset();
                os.write(magic);
                os.writeLong(base_revision);
                os.writeInt(page_size);
                os.writeInt(free_list_page);
                os.writeInt(pessimistic_recovery_page);
                os.writeInt(optimistic_recovery_page);

                int length = os.position();
                byte[] data = os.getData();

                CRC32 checksum = new CRC32();
                checksum.update(data, 0, length);

                os.position((FILE_HEADER_SIZE / 2) - 8);
                os.writeLong(checksum.getValue());
                System.arraycopy(data, 0, data, FILE_HEADER_SIZE / 2, length);
                // The 2nd copy's checksum lives at the end of the header area;
                // decode() reads it back from FILE_HEADER_SIZE - 8.
                os.position(FILE_HEADER_SIZE - 8);
                os.writeLong(checksum.getValue());
                return os.toBuffer();
            } catch (IOException e) {
                throw new RuntimeException(e);
            }
        }

        void decode(Buffer buffer) throws PagingException {
            DataByteArrayInputStream is = new DataByteArrayInputStream(buffer);
            int length = readFields(is);
            is.setPos((FILE_HEADER_SIZE / 2) - 8);
            long expectedChecksum = is.readLong();
            CRC32 checksum = new CRC32();
            checksum.update(buffer.data, 0, length);
            if (checksum.getValue() != expectedChecksum) {
                // Try the 2nd copy..
                is.setPos(FILE_HEADER_SIZE / 2);
                length = readFields(is);
                is.setPos(FILE_HEADER_SIZE - 8);
                expectedChecksum = is.readLong();
                checksum = new CRC32();
                // Checksum only the 2nd copy's fields; they start at
                // FILE_HEADER_SIZE / 2.
                checksum.update(buffer.data, FILE_HEADER_SIZE / 2, length - (FILE_HEADER_SIZE / 2));
                if (checksum.getValue() != expectedChecksum) {
                    throw new PagingException("file header corruption detected.");
                }
            }
        }

        private int readFields(DataByteArrayInputStream is) {
            is.readFully(magic);
            base_revision = is.readLong();
            page_size = is.readInt();
            free_list_page = is.readInt();
            pessimistic_recovery_page = is.readInt();
            optimistic_recovery_page = is.readInt();
            int length = is.getPos();
            return length;
        }
    }

    /** The header structure of the file */
    private final Header header = new Header();

    private final LinkedNodeList<Batch> batches = new LinkedNodeList<Batch>();
    private final MemoryMappedFile file;
    final Allocator allocator;
    final HawtPageFile pageFile;
    private static final int updateBatchSize = 1024;
    private final boolean synch;
    private volatile int lastBatchPage = -1;

    //
    // The following batch objects point to linked nodes in the previous batch list.
    // They are used to track/designate the state of the batch object.
    //

    /** The current batch that is being assembled. */
    volatile Batch openBatch;
    /** The batches that are being stored... These might be recoverable. */
    volatile Batch storingBatches;
    /** The stored batches. */
    volatile Batch storedBatches;
    /** The performed batches. Page updates have been copied from the redo
     *  pages to the original page locations. */
    volatile Batch performedBatches;

    /** A read cache used to speed up access to frequently used pages */
    volatile ReadCache readCache;
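    // Illustrative: the four pointers above partition the `batches` linked
    // list (ordered oldest to youngest) into contiguous state segments:
    //
    //   [performedBatches .. storedBatches)  : performed, awaiting release
    //   [storedBatches    .. storingBatches) : stored (synced to disk)
    //   [storingBatches   .. openBatch)      : storing (written, not yet synced)
    //   [openBatch]                          : open, still accepting commits
    //
    // A state transition just advances one pointer to the next node, which is
    // why it can be done while holding TRANSACTION_MUTEX only briefly.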
    //
    // Profilers like YourKit only report which mutex class was locked, so we
    // create a distinct class for each mutex to more easily tell which one
    // was contended.
    //
    private static class HOUSE_KEEPING_MUTEX {
        public String toString() {
            return "HOUSE_KEEPING_MUTEX";
        }
    }

    private static class TRANSACTION_MUTEX {
        public String toString() {
            return "TRANSACTION_MUTEX";
        }
    }

    /**
     * Mutex for data structures which are used during house keeping tasks like batch
     * management. Once acquired, you can also acquire the TRANSACTION_MUTEX.
     */
    private final HOUSE_KEEPING_MUTEX HOUSE_KEEPING_MUTEX = new HOUSE_KEEPING_MUTEX();

    /**
     * Mutex for data structures which transaction threads access. Never attempt to
     * acquire the HOUSE_KEEPING_MUTEX once this mutex is acquired.
     */
    final TRANSACTION_MUTEX TRANSACTION_MUTEX = new TRANSACTION_MUTEX();

    /**
     * This is the free page list at the base revision. It does not
     * track allocations in transactions or committed updates. Only
     * when the updates are performed will this list be updated.
     *
     * The main purpose of this list is to initialize the free list
     * on recovery.
     *
     * This does not track the space associated with batch lists
     * and free lists. On recovery that space is discovered and
     * tracked in the page file allocator.
     */
    private Ranges storedFreeList = new Ranges();

    private final ExecutorService worker;

    public HawtTxPageFile(TxPageFileFactory factory, HawtPageFile pageFile) {
        this.pageFile = pageFile;
        this.synch = factory.isSync();
        this.file = pageFile.getFile();
        this.allocator = pageFile.allocator();
        this.readCache = new ReadCache(pageFile, factory.getPageCache());

        if (factory.isUseWorkerThread()) {
            worker = Executors.newSingleThreadExecutor(new ThreadFactory() {
                public Thread newThread(Runnable r) {
                    Thread rc = new Thread(r);
                    rc.setName("HawtDB Worker");
                    rc.setDaemon(true);
                    return rc;
                }
            });
        } else {
            worker = null;
        }
    }

    public ReadCache readCache() {
        return readCache;
    }

    public void close() {
        if (worker != null) {
            final CountDownLatch done = new CountDownLatch(1);
            worker.execute(new Runnable() {
                public void run() {
                    done.countDown();
                    worker.shutdownNow();
                }
            });
            try {
                done.await();
            } catch (InterruptedException e) {
            }
        }
        flush();
        performBatches();
    }

    @Override
    public String toString() {
        return "{\n" +
               " allocator: " + allocator + ",\n" +
               " synch: " + synch + ",\n" +
               " read cache size: " + readCache.cache().size() + ",\n" +
               " base revision free pages: " + storedFreeList + ",\n" +
               " batches: {\n" +
               "   performed: " + toString(performedBatches, storedBatches) + ",\n" +
               "   stored: " + toString(storedBatches, storingBatches) + ",\n" +
               "   storing: " + toString(storingBatches, openBatch) + ",\n" +
               "   open: " + toString(openBatch, null) + ",\n" +
               " }" + "\n" +
               "}";
    }

    /**
     * @param from
     * @param to
     * @return string representation of the batch items from the specified
     *         batch up to (exclusive) the specified batch.
     */
    private String toString(Batch from, Batch to) {
        StringBuilder rc = new StringBuilder();
        rc.append("[ ");
        Batch t = from;
        while (t != null && t != to) {
            if (t != from) {
                rc.append(", ");
            }
            rc.append(t);
            t = t.getNext();
        }
        rc.append(" ]");
        return rc.toString();
    }

    /* (non-Javadoc)
     * @see org.fusesource.hawtdb.internal.page.TransactionalPageFile#tx()
     */
    public Transaction tx() {
        return new HawtTransaction(this);
    }
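    // A usage sketch (not code from this class): callers obtain transactions
    // from tx() and retry on OptimisticUpdateException, which commit() below
    // raises via the snapshot tracker's commitCheck when a conflicting commit
    // landed after the transaction's snapshot. The factory setup shown is an
    // assumption based on the public TxPageFileFactory API; the file name is
    // illustrative.
    //
    // <pre>{@code
    // TxPageFileFactory factory = new TxPageFileFactory();
    // factory.setFile(new File("example.db"));
    // factory.open();
    // TxPageFile pageFile = factory.getTxPageFile();
    //
    // while (true) {
    //     Transaction tx = pageFile.tx();
    //     try {
    //         // ... read and update pages through the Transaction here ...
    //         tx.commit();
    //         break;
    //     } catch (OptimisticUpdateException expected) {
    //         tx.rollback(); // conflicting concurrent update: retry
    //     }
    // }
    // factory.close();
    // }</pre>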
    /**
     * Attempts to commit a set of page updates.
     *
     * @param snapshot
     * @param pageUpdates
     * @param flushCallbacks
     */
    void commit(Snapshot snapshot, ConcurrentHashMap<Integer, Update> pageUpdates, ArrayList<Runnable> flushCallbacks) {

        boolean fullBatch = false;
        Commit commit = null;
        synchronized (TRANSACTION_MUTEX) {

            // we need to figure out the revision id of this commit...
            long rev;
            if (snapshot != null) {

                // Let's check for an OptimisticUpdateException: verify that the
                // new commit's updates don't conflict with a commit that occurred
                // subsequent to the snapshot that this commit started operating on.
                //
                // Note: every deferred update has an entry in the pageUpdates, so
                // no need to check whether that map also conflicts.
                rev = snapshot.getTracker().commitCheck(pageUpdates);
                snapshot.close();
            } else {
                rev = openBatch.head;
            }
            rev++;

            if (flushCallbacks != null) {
                openBatch.flushCallbacks.addAll(flushCallbacks);
            }

            commit = openBatch.commits.getTail();
            if (commit != null && commit.snapshotTracker == null) {
                // Just merge with the previous commit if it does not have an
                // open snapshot.
                // TODO: we are inside the TRANSACTION_MUTEX and this seems CPU
                // intensive, but it's better than always creating more commit
                // entries, as that slows down page lookup (they have to iterate
                // through all the commits).
                commit.merge(pageFile.allocator(), rev, pageUpdates);
            } else {
                commit = new Commit(rev, pageUpdates);
                openBatch.commits.addLast(commit);
            }

            if (openBatch.base == -1) {
                openBatch.base = rev;
            }
            openBatch.head = rev;

            if (openBatch.pageCount() > updateBatchSize) {
                fullBatch = true;
            }
        }

        if (fullBatch) {
            trace("batch full.");
            synchronized (HOUSE_KEEPING_MUTEX) {
                storeBatches(false);
            }

            if (worker != null) {
                worker.execute(new Runnable() {
                    public void run() {
                        synchronized (HOUSE_KEEPING_MUTEX) {
                            syncBatches();
                        }
                    }
                });
            } else {
                synchronized (HOUSE_KEEPING_MUTEX) {
                    syncBatches();
                }
            }
        }
    }

    /**
     * Used to initialize a new file or to clear out the
     * contents of an existing file.
     */
    public void reset() {
        synchronized (HOUSE_KEEPING_MUTEX) {
            batches.clear();
            performedBatches = storedBatches = storingBatches = openBatch = new Batch(-1);
            batches.addFirst(openBatch);
            lastBatchPage = -1;
            readCache.cache().clear();

            allocator.clear();
            storedFreeList.clear();
            storedFreeList.add(0, allocator.getLimit());

            // Initialize the file header..
            System.arraycopy(MAGIC, 0, header.magic, 0, MAGIC.length);
            header.base_revision = -1;
            header.free_list_page = -1;
            header.page_size = pageFile.getPageSize();
            header.pessimistic_recovery_page = -1;
            header.optimistic_recovery_page = -1;
            storeHeader();
        }
    }
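    // Illustrative startup sequence (an assumption about how the factory is
    // expected to drive this class, not code found in this file):
    //
    //   if (the file already contains data) {
    //       txPageFile.recover();  // replay batch logs into a consistent state
    //   } else {
    //       txPageFile.reset();    // write a fresh header and free list
    //   }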
    /**
     * Loads an existing file and replays the batch
     * logs to put it in a consistent state.
     */
    public void recover() {
        synchronized (HOUSE_KEEPING_MUTEX) {
            batches.clear();
            performedBatches = storedBatches = storingBatches = openBatch = new Batch(-1);
            batches.addFirst(openBatch);
            lastBatchPage = -1;
            readCache.cache().clear();

            Buffer buffer = new Buffer(FILE_HEADER_SIZE);
            file.read(0, buffer);
            header.decode(buffer);
            if (!Arrays.equals(MAGIC, header.magic)) {
                throw new PagingException("The file header is not of the expected type.");
            }

            trace("recovery started. header: %s", header);

            // Initialize the free page list.
            if (header.free_list_page >= 0) {
                storedFreeList = loadObject(header.free_list_page);
                trace("loaded free page list: %s ", storedFreeList);
                allocator.setFreeRanges(storedFreeList);
                Extent.unfree(pageFile, header.free_list_page);
            } else {
                allocator.clear();
                storedFreeList.add(0, allocator.getLimit());
            }

            int pageId = header.pessimistic_recovery_page;
            if (header.optimistic_recovery_page >= 0) {
                pageId = header.optimistic_recovery_page;
            }

            LinkedList<Batch> loaded = new LinkedList<Batch>();
            boolean consistencyCheckNeeded = true;
            while (pageId >= 0) {
                trace("loading batch at: %d", pageId);

                Batch batch = null;
                if (pageId == header.pessimistic_recovery_page) {
                    consistencyCheckNeeded = false;
                }
                if (consistencyCheckNeeded) {
                    // The write could be corrupted.. let's be careful.
                    try {
                        batch = loadObject(pageId);
                    } catch (Exception e) {
                        trace("incomplete batch at: %d", pageId);
                        // Clear out any previously loaded batches and resume
                        // from the pessimistic location.
                        loaded.clear();
                        pageId = header.pessimistic_recovery_page;
                        continue;
                    }
                } else {
                    // it should load fine..
                    batch = loadObject(pageId);
                }

                batch.page = pageId;
                batch.recovered = true;
                loaded.add(batch);
                trace("loaded batch: %s", batch);

                // is this the last batch we need to load?
                if (header.base_revision + 1 == batch.base) {
                    break;
                }
                pageId = batch.previous;
            }

            if (loaded.isEmpty()) {
                trace("no batches need to be recovered.");
            } else {
                // Link up the batch objects...
                for (Batch batch : loaded) {
                    // Make sure the batch pages are not in the free list.
                    Extent.unfree(pageFile, batch.page);

                    if (openBatch.head == -1) {
                        openBatch.head = batch.head;
                    }

                    // Add first: we iterate the loaded batches youngest to
                    // oldest, but want the list ordered oldest to youngest.
                    batches.addFirst(batch);
                    performedBatches = storedBatches = batch;
                }

                // Perform the updates..
                performBatches();
                syncBatches();
            }
        }
    }

    /* (non-Javadoc)
     * @see org.fusesource.hawtdb.internal.page.TransactionalPageFile#flush()
     */
    public void flush() {
        synchronized (HOUSE_KEEPING_MUTEX) {
            storeBatches(true);
            syncBatches();
        }
    }

    public void flush(final Runnable onComplete) {
        if (worker != null) {
            worker.execute(new Runnable() {
                public void run() {
                    flush();
                    onComplete.run();
                }
            });
        } else {
            flush();
            onComplete.run();
        }
    }

    // /////////////////////////////////////////////////////////////////
    //
    // Methods which transition batches through their life cycle states:
    //
    //   open -> storing -> stored -> performing -> performed -> released
    //
    // state: open - you can add additional commits to the batch
    //
    //   on: batch size limit reached
    //   action: write the batch to disk
    //           update optimistic_recovery_page
    //
    // state: storing - batch was written to disk, but not synced.. batch may
    //                  be lost on failure.
    //
    //   on: disk sync
    //   action: update pessimistic_recovery_page
    //
    // state: stored - we now know the batch can be recovered. Updates will
    //                 not be lost once we hit this state.
    //
    //   on: original pages drained of open snapshots
    //   action: copy shadow pages to original pages
    //
    // state: performing - original pages are being updated. Updates might be
    //                     partially applied.
    //
    //   on: disk sync
    //
    // state: performed - original pages now updated.
    //
    //   action: the batch becomes the base revision, new snapshots can refer
    //           to the original page locations.
    //
    //   on: batch drained of open snapshots
    //
    // state: released - the batch is no longer being used.
    //
    //   action: free the batch shadow pages
    //
    // /////////////////////////////////////////////////////////////////
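    // Illustrative timeline for a single batch B as it moves through the
    // states above (a sketch matching the methods that follow):
    //
    //   commit()          fills B                B: open
    //   storeBatches()    writes B to disk       B: storing
    //                     header.optimistic_recovery_page = B.page
    //   syncBatches()     file.sync()            B: stored
    //                     header.pessimistic_recovery_page = B.page
    //   performBatches()  copies shadow pages    B: performed
    //   syncBatches()     (a later sync)         B: released
    //                     B's pages freed, header.base_revision = B.head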
    /**
     * Attempts to perform a batch state change: open -> storing
     */
    private void storeBatches(boolean force) {
        Batch batch;

        // We synchronize /w the transactions so that they see the state change.
        synchronized (TRANSACTION_MUTEX) {
            // Re-checking since storing the batch may not be needed.
            if ((force && openBatch.base != -1) || openBatch.pageCount() > updateBatchSize) {
                batch = openBatch;
                openBatch = new Batch(batch.head);
                batches.addLast(openBatch);
            } else {
                return;
            }
        }

        // Write any outstanding deferred cache updates...
        batch.performDeferredUpdates(pageFile);

        // Link it to the last batch.
        batch.previous = lastBatchPage;

        // Store the batch record.
        lastBatchPage = batch.page = storeObject(batch);
        trace("stored batch: %s", batch);

        // Update the header to know about the new batch page.
        header.optimistic_recovery_page = batch.page;
        storeHeader();
    }

    /**
     * Performs a file sync.
     *
     * This allows two types of batch state changes to occur:
     * <ul>
     * <li> storing -> stored
     * <li> performed -> released
     * </ul>
     */
    private void syncBatches() {

        // This is a slow operation..
        if (synch) {
            file.sync();
        }

        // Update the base_revision with the last performed revision.
        if (performedBatches != storedBatches) {
            Batch lastPerformedBatch = storedBatches.getPrevious();
            header.base_revision = lastPerformedBatch.head;
        }

        // Were there some batches in the storing state?
        if (storingBatches != openBatch) {

            // Callback the runnables which were waiting for the updates to
            // be fully flushed to disk.
            Batch cur = storingBatches;
            while (cur != openBatch) {
                // Note: iterate cur's callbacks (not storingBatches') so each
                // batch in the range gets its own callbacks run.
                for (Runnable runnable : cur.flushCallbacks) {
                    try {
                        runnable.run();
                    } catch (Throwable e) {
                        e.printStackTrace();
                    }
                }
                cur = cur.getNext();
            }

            // The last stored batch is actually synced now..
            Batch lastStoredBatch = openBatch.getPrevious();
            // Let the header know about it..
            header.pessimistic_recovery_page = lastStoredBatch.page;
            if (header.optimistic_recovery_page == header.pessimistic_recovery_page) {
                header.optimistic_recovery_page = -1;
            }

            // We synchronize /w the transactions so that they see the state change.
            synchronized (TRANSACTION_MUTEX) {
                // Transition storing -> stored.
                storingBatches = openBatch;
            }
        }

        // Apply any batches that can be applied..
        performBatches();

        // Once a batch has been performed, subsequently synced, and no longer
        // referenced, its allocated recovery space can be released.
        while (performedBatches != storedBatches) {
            if (performedBatches.snapshots != 0) {
                break;
            }
            if (performedBatches.page == header.pessimistic_recovery_page) {
                header.pessimistic_recovery_page = -1;
            }

            // Free the update pages associated with the batch.
            performedBatches.release(allocator);

            // Free the batch record itself.
            Extent.free(pageFile, performedBatches.page);

            // No need to sync /w transactions since they don't use the
            // performedBatches variable.
            // Transition performed -> released.
            performedBatches = performedBatches.getNext();

            // Remove the released batch from the batch list.
            performedBatches.getPrevious().unlink();
        }

        // Store the free list..
        int previousFreeListPage = header.free_list_page;
        header.free_list_page = storeObject(storedFreeList);
        storeHeader();

        // Release the previous free list.
        if (previousFreeListPage >= 0) {
            Extent.free(pageFile, previousFreeListPage);
        }
    }
    /**
     * Attempts to perform a batch state change: stored -> performed
     *
     * Once a batch is performed, new snapshots will not reference
     * the batch anymore.
     */
    public void performBatches() {

        if (storedBatches == storingBatches) {
            // There are no batches in the stored state for us to transition.
            return;
        }

        // The last performed batch MIGHT still have an open snapshot.
        // We can't transition from stored until that snapshot closes.
        Batch lastPerformed = storedBatches.getPrevious();
        if (lastPerformed != null && lastPerformed.snapshots != 0) {
            return;
        }

        while (storedBatches != storingBatches) {
            trace("Performing batch: %s", storedBatches);

            // Performing the batch actually applies the updates to the
            // original page locations.
            for (Commit commit : storedBatches) {
                for (Entry<Integer, Update> entry : commit.updates.entrySet()) {
                    int page = entry.getKey();
                    Update update = entry.getValue();

                    if (traced(page) || (update.shadowed() && traced(update.shadow()))) {
                        trace("performing update at %d %s", page, update);
                    }

                    // is it a shadow update?
                    if (update.shadowed()) {

                        if (storedBatches.recovered) {
                            // If we are recovering, the allocator MIGHT not have
                            // the shadow page marked as allocated. This makes sure
                            // it's allocated so that a new transaction does not
                            // get this page and overwrite it in error.
                            allocator.unfree(update.shadow(), 1);
                        }

                        // Perform the update by copying the updated page to the
                        // original page location.
                        if (traced(page) || traced(update.shadow())) {
                            trace("performing shadow update on %d from %d", page, update.shadow());
                        }
                        ByteBuffer slice = pageFile.slice(SliceType.READ, update.shadow(), 1);
                        try {
                            pageFile.write(page, slice);
                        } finally {
                            pageFile.unslice(slice);
                        }
                    }

                    if (update.allocated()) {
                        if (storedBatches.recovered) {
                            // If we are recovering, the allocator MIGHT not have
                            // this page marked as allocated. This makes sure it's
                            // allocated so that a new transaction does not get
                            // this page and overwrite it in error.
                            allocator.unfree(page, 1);
                        }
                        // Update the persistent free list. This gets stored on
                        // the next sync.
                        storedFreeList.remove(page, 1);
                    } else if (update.freed()) {
                        storedFreeList.add(page, 1);
                    }

                    // update the read cache..
                    DeferredUpdate du = update.deferredUpdate();
                    if (du != null) {
                        if (du.removed()) {
                            readCache.cache().remove(page);
                        } else if (du.put()) {
                            readCache.cache().put(page, du.value);
                        }
                    }
                }
            }
            storedBatches.performed = true;

            // We synchronize /w the transactions so that they see the state change.
            synchronized (TRANSACTION_MUTEX) {
                // Transition stored -> performed.
                storedBatches = storedBatches.getNext();
            }

            lastPerformed = storedBatches.getPrevious();
            // We have to stop if the last batch performed has an open snapshot.
            if (lastPerformed.snapshots != 0) {
                break;
            }
        }
    }

    // /////////////////////////////////////////////////////////////////
    // Snapshot management
    // /////////////////////////////////////////////////////////////////
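    // Illustrative: an open snapshot pins batches via their snapshot
    // counters, which is why performBatches() and syncBatches() above stop
    // at any batch with snapshots != 0. A snapshot opened against the head
    // commit of the open batch reuses that commit's tracker, so concurrent
    // readers of the same revision share a single SnapshotTracker.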
    Snapshot openSnapshot() {
        synchronized (TRANSACTION_MUTEX) {

            // Re-use the last entry if it was a snapshot head..
            Commit commit = openBatch.getHeadCommit();
            SnapshotTracker tracker = null;
            if (commit != null) {
                if (commit.snapshotTracker == null) {
                    // So we can track the new snapshot...
                    commit.snapshotTracker = new SnapshotTracker(openBatch, commit);
                }
                tracker = commit.snapshotTracker;
            } else {
                tracker = new SnapshotTracker(openBatch, null);
            }

            // Open the snapshot.
            return new Snapshot(this, tracker, storedBatches, openBatch).open();
        }
    }

    // /////////////////////////////////////////////////////////////////
    // Helper methods
    // /////////////////////////////////////////////////////////////////

    private int storeObject(Object value) {
        try {
            ExtentOutputStream eos = new ExtentOutputStream(pageFile);
            ObjectOutputStream oos = new ObjectOutputStream(eos);
            oos.writeObject(value);
            oos.close();
            return eos.getPage();
        } catch (IOException e) {
            throw new IOPagingException(e);
        }
    }

    @SuppressWarnings("unchecked")
    private <T> T loadObject(int pageId) {
        try {
            ExtentInputStream eis = new ExtentInputStream(pageFile, pageId);
            ObjectInputStream ois = new ObjectInputStream(eis);
            return (T) ois.readObject();
        } catch (IOException e) {
            throw new IOPagingException(e);
        } catch (ClassNotFoundException e) {
            throw new IOPagingException(e);
        }
    }

    private void storeHeader() {
        trace("storing file header: %s", header);
        file.write(0, header.encode());
    }
}